Working with audio

Getting started

The first to get started are similar to those follow in the first Tutorial.


In [1]:
%matplotlib inline

from IPython.display import Audio

import matplotlib.pyplot as plt
import numpy as np
import inspire

In [2]:
# Get the evaluation setting
setting = inspire.get_evaluation_setting()

# Get the dataset
dataset_filename = setting.download_dataset()
dataset = inspire.load_dataset(dataset_filename)

Token Audio

The next step is to download the dataset audio.


In [3]:
dataset_audio_filename = inspire.download_dataset_audio()

In [4]:
token_id = '36504'
sample_rate, signal_audio, noise_audio = inspire.get_token_audio(token_id, dataset_audio_filename, dataset)

In [5]:
plt.plot(noise_audio, label='noise')
plt.plot(signal_audio, label='speech')
plt.legend()
_ = plt.title('Audio of token {}'.format(token_id))



In [6]:
mix_audio = signal_audio + noise_audio

In [7]:
plt.plot(mix_audio, label='mix')
plt.legend()
_ = plt.title('Audio of token {}'.format(token_id))



In [8]:
Audio(data=signal_audio, rate=sample_rate)


Out[8]:

In [9]:
Audio(data=noise_audio, rate=sample_rate)


Out[9]:

In [10]:
Audio(data=mix_audio, rate=sample_rate)


Out[10]:

SNR-based Submission


In [11]:
# Get the lexicon
lexicon_filename = setting.download_lexicon()
lexicon = inspire.load_lexicon(lexicon_filename)

In [12]:
submission = inspire.Submission(email='dummy@email.com',
                                description='''SNR-based Trivial Alignment:
where task: For the phoneme positions we predict a 50% chance of observing a confusion. We predict a 1% chance of confusion at the inter-phoneme positions and a 30% chance of confusion at the positions before and after the utterance.
what task: Not provided.
full task: Not provided.
                                ''',
                                evaluation_setting=setting)

In [13]:
# Iterate over all the stimuli in our dataset
for token_id, token  in dataset['tokens'].items():
    if 'responses' not in token:
        continue
        
    # Lexicon may contain multiple pronunciations (we arbitrarily select the first)
    word = token['speech']
    pronunciation = lexicon[word][0]
    
    # Possible indices of confusions
    # are the number of phonemes plus
    # the number of positions around phonemes
    index_count = len(pronunciation)*2 + 1
    
    confusion_probabilities = np.zeros(index_count)

    # 
    sample_rate, signal_audio, noise_audio = inspire.get_token_audio(token_id, dataset_audio_filename, dataset)
    signal_power = np.array([x.sum() for x in np.array_split(np.square(signal_audio), index_count)])
    mix_power = np.array([x.sum() for x in np.array_split(np.square(signal_audio) + np.square(noise_audio), index_count)])
    
    smr = (signal_power / mix_power)
    
    confusion_probabilities[0] = 0.5
    confusion_probabilities[-1] = 0.5
    confusion_probabilities[2:-2:2] = 0.01
    
    confusion_probabilities[1::2] = (1.0 - smr[1::2])*0.3
    
    submission.where_task(token_id, confusion_probabilities)

In [14]:
submission.save('submission_snr_trivialalignment.json')
inspire.pprint(submission['tokens']['36504'])


{u'where': [0.5,
            0.30000001192092896,
            0.01,
            0.16731560230255127,
            0.01,
            0.066851258277893066,
            0.01,
            0.16067713499069214,
            0.01,
            0.2949715256690979,
            0.01,
            0.30000001192092896,
            0.5]}

In [ ]:
job = submission.evaluate(password='dummypassword')
job.wait()

In [17]:
result = job.result()

inspire.pprint(result['where']['token_averaged'])


-4.485098902356051

In [18]:
inspire.pprint(dataset['tokens']['36504'])


{u'noise_onset': 790069,
 u'noise_transcription': u'BAB8.txt',
 u'noise_type': u'bab8',
 u'noise_wav': u'BAB8.wav',
 u'responses': {u'manda': 2,
                u'mandas': 2,
                u'mando': 1,
                u'mandos': 1,
                u'mangos': 1,
                u'mano': 1,
                u'manos': 6,
                u'mantas': 1},
 u'signal_wav': u'T_36504.wav',
 u'snr': 0.4,
 u'speaker': u's1',
 u'speech': u'mandan'}

In [19]:
edit_scripts = inspire.get_edit_scripts(lexicon['mandan'][0], lexicon['manos'][0])
for edit_script in edit_scripts:
    inspire.print_edit_script(edit_script)
    print('---')


m ˈa n d a n 
m ˈa n   o s 
---
m ˈa n d a n 
m ˈa n o   s 
---
m ˈa n d a n 
m ˈa n o s   
---

In [ ]: